########## EMPIRICAL EXAMPLES IN PYTHON ##########

# Specify working directory
import os # load os library
path = '/Causal_Analysis' # the path should be changed by the user
os.chdir(path) # change working directory to path

# Chapter 3

# 1
import pandas as pd # load pandas library
df = pd.read_csv('data/JC.csv') # load JC data
D = df['assignment'] # define treatment (assignment to JC)
Y = df['earny4'] # define outcome (earnings in fourth year)
Y[D==1].mean() - Y[D==0].mean() # estimate the ATE by the mean difference

# 2
import pandas as pd # load pandas library
import statsmodels.api as sm # load statsmodels library
df = pd.read_csv('data/JC.csv') # load JC data
D = df['assignment'] # define treatment (assignment to JC)
Y = df['earny4'] # define outcome (earnings in fourth year)
D = sm.add_constant(D) # add a constant to get an intercept
ols = sm.OLS(Y, D).fit(cov_type = 'HC0') # run OLS regression with White's heteroskedasticity-robust se
print(ols.summary()) # output

# 3
import numpy as np # load numpy library
import pandas as pd # load pandas library
from sklearn.linear_model import LinearRegression # import linear regression
from typing import Dict # import Dict type hint
from scipy.stats import norm # import normal distribution

def make_bootstraps(data: pd.DataFrame, n_bootstraps: int = 100) -> Dict[str, Dict[str, pd.DataFrame]]: # create boot data
    dc = {} # initialize empty dict
    sample_size = data.shape[0] # get sample size
    idx = [i for i in range(sample_size)] # list of row indices
    for b in range(n_bootstraps): # iterate over the number of bootstraps
        sidx = np.random.choice(idx, replace = True, size = sample_size) # draw random indices with replacement
        b_samp = data.loc[sidx, :] # select bootstrap sample
        oob_idx = list(set(idx) - set(sidx)) # indices that were not drawn (out of bag)
        t_samp = np.array([]) # create empty array
        if oob_idx: # if some observations were not drawn
            t_samp = data.loc[oob_idx, :] # put them in the test sample
        dc['boot_' + str(b)] = {'boot': b_samp, 'test': t_samp} # add bootstrap and test sample to dict
    return dc # return dict

df = pd.read_csv('data/JC.csv') # load JC data
df_boot = make_bootstraps(df, 1999) # create 1999 bootstrap samples
coefs = np.array([]) # create empty array for coefficients
intrs = np.array([]) # create empty array for intercepts
lr = LinearRegression() # load linear regression method
for b in df_boot: # iterate over bootstrap samples
    lr.fit(df_boot[b]['boot'].loc[:, 'assignment'].values.reshape(-1, 1),
           df_boot[b]['boot'].loc[:, 'earny4'].values.reshape(-1, 1)) # run linear regression
    coefs = np.concatenate((coefs, lr.coef_.flatten())) # store coefficients
    intrs = np.concatenate((intrs, lr.intercept_.flatten())) # store intercepts
print(f'intercept: {np.mean(intrs)}\n' # print mean intercept
      f'coefficient: {np.mean(coefs)}') # print mean coefficient
tstat = np.mean(coefs)/np.std(coefs) # compute t-statistic
p_val = 2*norm.cdf(-np.absolute(tstat)) # compute p-value
print(f'tstat: {tstat}\n' # print t-statistic
      f'p_val: {p_val}') # print p-value
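# Addendum to # 3 (not in the original): instead of the normal approximation,
# a percentile-based 95% confidence interval can be read directly off the
# stored bootstrap coefficients; a minimal sketch reusing coefs from above.
ci_lower, ci_upper = np.percentile(coefs, [2.5, 97.5]) # 2.5th and 97.5th percentiles of the boot. coefficients
print(f'95% CI: [{ci_lower}, {ci_upper}]') # print percentile bootstrap CI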
# 4
import pandas as pd # load pandas library
import statsmodels.api as sm # load statsmodels library
df = pd.read_csv('data/wexpect.csv') # load wexpect data
D1 = df['treatmentinformation'] # define first treatment (wage information)
D2 = df['treatmentorder'] # define second treatment (order of questions)
D = pd.DataFrame([D1, D2]).T # sm.OLS takes a single design matrix
D = sm.add_constant(D) # add a constant to get an intercept
Y = df['wexpect2'] # define outcome (wage expectations)
ols = sm.OLS(Y, D).fit(cov_type = 'HC0') # run OLS regression with White's heteroskedasticity-robust se
print(ols.summary()) # output

# 5
import numpy as np # load numpy library
import pandas as pd # load pandas library
import statsmodels.api as sm # load statsmodels library
import matplotlib.pyplot as plt # load matplotlib library
import scipy.stats as st # load scipy.stats library
df = pd.read_csv('data/marketing.csv') # load marketing data
D = df['newspaper'] # define treatment (newspaper advertising)
Y = df['sales'] # define outcome (sales)
x_axis = np.linspace(min(D), max(D), num = 100) # define points where we evaluate
results = sm.nonparametric.KernelReg(Y, D, 'c', reg_type = 'lc').fit(x_axis) # kernel regression
plt.plot(x_axis, results[0], color = 'black') # plot regression function
plt.xlabel('D') # label the x axis
plt.ylabel('Y') # label the y axis
plt.show() # show the plot

# 6
import pandas as pd # load pandas library
import statsmodels.api as sm # load statsmodels library
df = pd.read_csv('data/coffeeleaflet.csv').dropna() # load coffeeleaflet data
D = df['treatment'] # define treatment (leaflet)
Y = df['awarewaste'] # define outcome (awareness of waste production)
X1 = df['mumedu'] # define first covariate (education of mum)
X2 = df['sex'] # define second covariate (gender)
V = pd.DataFrame([D, X1, X2]).T # join regressors (treatment + covariates)
V = sm.add_constant(V) # add a constant to get an intercept
ols = sm.OLS(Y, V).fit(cov_type = 'HC0') # run OLS regression with White's heteroskedasticity-robust se
print(ols.summary()) # output

# Chapter 4

# 7
import pandas as pd # load pandas library
import statsmodels.api as sm # load statsmodels library
df = pd.read_csv('data/lalonde.csv').dropna() # load lalonde data
D = df['treat'] # define treatment (training)
Y = df['re78'] # define outcome
X = df[['age', 'educ', 'nodegr', 'married', 'black', 'hisp',
        're74', 're75', 'u74', 'u75']] # define covariates

def demean(column): # hand-made demeaning function
    return column.sub(column.mean())

cols = X.columns # get columns of covariates
Xdemeaned = X[cols].apply(demean) # demeaned X

def interaction_withD(term1): # hand-made interaction function with treatment
    return term1.mul(D)

DXdemeaned = Xdemeaned[cols].apply(interaction_withD) # interaction of D and demeaned X
DXdemeaned.rename(columns = {'age': 'age*treat', # rename interaction terms
                             'educ': 'educ*treat',
                             'nodegr': 'nodegr*treat',
                             'married': 'married*treat',
                             'black': 'black*treat',
                             'hisp': 'hisp*treat',
                             're74': 're74*treat',
                             're75': 're75*treat',
                             'u74': 'u74*treat',
                             'u75': 'u75*treat'}, inplace = True)
V = pd.concat([D, X, DXdemeaned], axis = 1) # join regressors (treatment, covariates, interactions)
V = sm.add_constant(V) # add a constant to get an intercept
ols = sm.OLS(Y, V).fit(cov_type = 'HC0') # run OLS regression with White's heteroskedasticity-robust se
print(ols.summary()) # output
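# Addendum to # 7 (not in the original): because the covariates entering the
# interactions are demeaned, the coefficient on 'treat' can be read directly
# as the ATE estimate; a minimal sketch reusing the fitted model from above.
print(ols.params['treat'], ols.bse['treat']) # ATE estimate and its robust standard error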
# 8
from causalinference import CausalModel # load CausalModel
import pandas as pd # load pandas library
import numpy as np # load numpy library
df = pd.read_csv('data/lalonde.csv') # load lalonde data
Y = np.asarray(df['re78']) # define outcome
D = np.asarray(df['treat']) # define treatment (training)
X = np.asarray(df[['age', 'educ', 'nodegr', 'married', 'black', 'hisp',
                   're74', 're75', 'u74', 'u75']]) # define covariates
model = CausalModel(Y, D, X) # define causal model
model.est_via_matching(weights = 'inv', matches = 1) # pair matching
print(model.estimates) # matching output

# 9
model = CausalModel(Y, D, X) # define causal model
model.est_via_matching(weights = 'inv', matches = 3, bias_adj = True) # 1:M matching
print(model.estimates) # matching output

# 10
from causalinference import CausalModel # load CausalModel
import pandas as pd # load pandas library
import numpy as np # load numpy library
import statsmodels.api as sm # load statsmodels library
df = pd.read_csv('data/lalonde.csv') # load lalonde data
Y = df['re78'] # define outcome
D = df['treat'] # define treatment (training)
X = df[['age', 'educ', 'nodegr', 'married', 'black', 'hisp',
        're74', 're75', 'u74', 'u75']] # define covariates
V = sm.add_constant(X) # add a constant to get an intercept
ps = sm.GLM(D, V, family = sm.families.Binomial()).fit().fittedvalues.astype('float') # propensity score by logit
Y = np.asarray(Y) # causal model needs np arrays
D = np.asarray(D)
ps = np.asarray(ps)
model = CausalModel(Y, D, ps) # define causal model
model.est_via_matching(weights = 'inv', matches = 1, bias_adj = True) # propensity score matching
print(model.estimates) # show the results
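# Addendum to # 10 (not in the original): causalinference also reports raw
# balance statistics, which can be used to check overlap in the propensity
# score before matching; a minimal sketch reusing the model from above.
print(model.summary_stats) # balance statistics for the matching variable (here: the propensity score)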
# 11
import numpy as np # load numpy library
import pandas as pd # load pandas library
from causalinference import CausalModel # import causal model
import statsmodels.api as sm # load statsmodels library
from typing import Dict # import Dict type hint
from scipy.stats import norm # import normal distribution

def make_bootstraps(data: pd.DataFrame, n_bootstraps: int = 100) -> Dict[str, Dict[str, pd.DataFrame]]: # create boot data
    dc = {} # initialize empty dict
    sample_size = data.shape[0] # get sample size
    idx = [i for i in range(sample_size)] # list of row indices
    for b in range(n_bootstraps): # iterate over the number of bootstraps
        sidx = np.random.choice(idx, replace = True, size = sample_size) # draw random indices with replacement
        b_samp = data.loc[sidx, :] # select bootstrap sample
        oob_idx = list(set(idx) - set(sidx)) # indices that were not drawn (out of bag)
        t_samp = np.array([]) # create empty array
        if oob_idx: # if some observations were not drawn
            t_samp = data.loc[oob_idx, :] # put them in the test sample
        dc['boot_' + str(b)] = {'boot': b_samp, 'test': t_samp} # add bootstrap and test sample to dict
    return dc # return dict

df = pd.read_csv('data/lalonde.csv') # load lalonde data
df_boot = make_bootstraps(df, 999) # create 999 bootstrap samples
att = np.array([]) # create empty array for att values
for b in df_boot: # iterate over bootstrap samples
    Y = df_boot[b]['boot'].loc[:, 're78'].values.reshape(-1, 1) # get bootstrap outcome
    D = df_boot[b]['boot'].loc[:, 'treat'].values.reshape(-1, 1) # get bootstrap treatment
    X = df_boot[b]['boot'][['age', 'educ', 'nodegr', 'married', 'black', 'hisp',
                            're74', 're75', 'u74', 'u75']].values.reshape(-1, 10) # get bootstrap covariates
    V = sm.add_constant(X) # add a constant to get an intercept
    ps = sm.GLM(D, V, family = sm.families.Binomial()).fit().fittedvalues.astype('float') # propensity score by logit
    Y = np.asarray(Y) # causal model needs np arrays
    D = np.asarray(D)
    ps = np.asarray(ps)
    model = CausalModel(Y, D, ps) # define causal model
    model.est_via_matching(weights = 'inv', matches = 1, bias_adj = True) # propensity score matching
    att = np.concatenate((att, model.estimates['matching']['att'].flatten())) # store att value
print(f'att: {np.mean(att)}') # print mean att
tstat = np.mean(att)/np.std(att) # compute t-statistic
p_val = 2*norm.cdf(-np.absolute(tstat)) # compute p-value
print(f'tstat: {tstat}\n' # print t-statistic
      f'p_val: {p_val}') # print p-value

# 12
import pandas as pd # load pandas library
import dowhy as dw # load dowhy library
import numpy as np # load numpy library
df = pd.read_csv('data/lbw.csv') # load lbw data
df['race'] = np.where(df['race'] == 1, 1, 0) # define race as a binary indicator for white/black
model = dw.CausalModel(data = df, # define causal model on the data
                       treatment = 'smoke', # define treatment (mother smoking)
                       outcome = 'bwt', # define outcome (birthweight in grams)
                       common_causes = ['race', 'age', 'lwt', 'ptl', 'ht', 'ui', 'ftv']) # define covariates
identified_estimand = model.identify_effect() # identify the causal effect to be estimated
estimate = model.estimate_effect(identified_estimand, # inverse probability weighting (IPW)
                                 method_name = 'backdoor.propensity_score_weighting', # specify the method
                                 target_units = 'ate', # specify the target
                                 method_params = {'weighting_scheme': 'ips_weight'}) # specify the method parameters
print(estimate.value) # show ATE
print(estimate.get_standard_error()) # show standard error
print(estimate.test_stat_significance()) # show p-value

# 13
''' no Python code available '''

# 14
import numpy as np # load numpy library (needed for np.where below)
import pandas as pd # load pandas library
import doubleml as dml # load doubleml library
from sklearn import linear_model # load linear_model from sklearn
df = pd.read_csv('data/lbw.csv') # load lbw data
df['race'] = np.where(df['race'] == 1, 1, 0) # define race as a binary indicator for white/black
X = df.loc[:, ['race', 'age', 'lwt', 'ptl', 'ht', 'ui', 'ftv']] # select covariates
dml_data = dml.DoubleMLData(data = df, # create dml_data from lbw data
                            y_col = 'bwt', # select outcome
                            d_cols = 'smoke', # select treatment
                            x_cols = list(X.columns.values)) # select covariates
ml_l = linear_model.LinearRegression() # define learner for E[Y|X]
ml_m = linear_model.LogisticRegression(max_iter = 350) # define learner for E[D|X]
dr = dml.DoubleMLPLR(dml_data, # double machine learning
                     ml_l, # specify outcome model
                     ml_m).fit() # specify treatment model, fit the model
print(dr.summary) # show the results
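# Addendum to # 14 (not in the original): DoubleML objects also expose
# confidence intervals directly; a minimal sketch reusing the fitted dr.
print(dr.confint(level = 0.95)) # 95% confidence interval for the treatment coefficient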
# 15
import pandas as pd # load pandas library
import statsmodels.api as sm # load statsmodels library
import numpy as np # load numpy library
import matplotlib.pyplot as plt # load matplotlib library
df = pd.read_csv('data/lbw.csv') # load lbw data
df['race'] = np.where(df['race'] == 1, 1, 0) # define race as a binary indicator for white/black
D = df['smoke'] # define treatment (mother smoking)
Y = df['bwt'] # define outcome (birthweight in grams)
X = df[['race', 'age', 'lwt', 'ptl', 'ht', 'ui', 'ftv']] # define covariates
V = sm.add_constant(X) # add a constant to get an intercept
ps = sm.GLM(D, V, family = sm.families.Binomial()).fit().fittedvalues # propensity score by logit
df['ps'] = ps # add propensity score to dataframe
df_treated = df.loc[df['smoke'] == 1] # select treated group
df_nontreated = df.loc[df['smoke'] == 0] # select non-treated group
psdens1 = sm.nonparametric.KDEUnivariate(df_treated['ps']).fit() # density of propensity score among treated
psdens0 = sm.nonparametric.KDEUnivariate(df_nontreated['ps']).fit() # density of propensity score among non-treated
fig, axs = plt.subplots(2, 2) # specify a figure with four graphs (2x2)
axs[0, 0].plot(psdens1.support, psdens1.density, color = 'black') # plot density of ps among treated
axs[0, 0].set_title('Density of ps among treated') # set title
axs[0, 0].set(ylabel = 'Density') # set y label
axs[0, 1].plot(psdens0.support, psdens0.density, color = 'black') # plot density of ps among non-treated
axs[0, 1].set_title('Density of ps among non-treated') # set title
axs[0, 1].set(ylabel = 'Density') # set y label
axs[1, 0].hist(df_treated['ps'], color = 'black') # plot histogram of ps among treated
axs[1, 0].set_title('Histogram of ps among treated') # set title
axs[1, 0].set(ylabel = 'Frequency') # set y label
axs[1, 1].hist(df_nontreated['ps'], color = 'black') # plot histogram of ps among non-treated
axs[1, 1].set_title('Histogram of ps among non-treated') # set title
axs[1, 1].set(ylabel = 'Frequency') # set y label
plt.show() # show plot
psdens1.bw # print the bandwidth for treated
psdens0.bw # print the bandwidth for non-treated
df_treated['ps'].describe() # summary stat. for p-scores among treated
df_nontreated['ps'].describe() # summary stat. for p-scores among non-treated
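# Addendum to # 15 (not in the original): a quick numerical check of common
# support, counting observations with extreme propensity scores; a minimal
# sketch reusing df from above with an (assumed) 0.01/0.99 trimming rule.
n_extreme = ((df['ps'] < 0.01) | (df['ps'] > 0.99)).sum() # observations outside [0.01, 0.99]
print(f'observations with extreme p-scores: {n_extreme}') # print the count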
# 16
from causalinference import CausalModel # load CausalModel
import pandas as pd # load pandas library
import numpy as np # load numpy library
import statsmodels.api as sm # load statsmodels library
import matplotlib.pyplot as plt # load matplotlib library
df = pd.read_csv('data/lbw.csv') # load lbw data
df['race'] = np.where(df['race'] == 1, 1, 0) # define race as a binary indicator for white/black
D = df['smoke'] # define treatment (mother smoking)
Y = df['bwt'] # define outcome (birthweight in grams)
X = df[['race', 'age', 'lwt', 'ptl', 'ht', 'ui', 'ftv']] # define covariates
V = sm.add_constant(X) # add a constant to get an intercept
ps = sm.GLM(D, V, family = sm.families.Binomial()).fit().fittedvalues # propensity score by logit
df['ps'] = ps # add propensity scores to dataframe
Y = np.asarray(Y) # causal model needs np arrays
D = np.asarray(D)
ps = np.asarray(ps)
model = CausalModel(Y, D, ps) # define causal model
model.est_via_matching(weights = 'inv', matches = 1) # propensity score matching
model.est_propensity() # estimate propensity scores
df['matched_ps'] = model.propensity['fitted'] # add matched propensity scores to dataframe
df_treated = df.loc[df['smoke'] == 1] # select treated group
df_nontreated = df.loc[df['smoke'] == 0] # select non-treated group
fig, axs = plt.subplots(2, 2) # specify a figure with four graphs (2x2)
axs[0, 0].hist(df_treated['ps'], color = 'black') # plot ps among treated
axs[0, 0].set_title('Raw treated') # set title
axs[0, 0].set(ylabel = 'Proportion') # set y label
axs[0, 0].set(xlabel = 'Propensity score') # set x label
axs[0, 1].hist(df_treated['matched_ps'], color = 'black') # plot ps after matching among treated
axs[0, 1].set_title('Matched treated') # set title
axs[0, 1].set(ylabel = 'Proportion') # set y label
axs[0, 1].set(xlabel = 'Propensity score') # set x label
axs[1, 0].hist(df_nontreated['ps'], color = 'black') # plot ps among non-treated
axs[1, 0].set_title('Raw control') # set title
axs[1, 0].set(ylabel = 'Proportion') # set y label
axs[1, 0].set(xlabel = 'Propensity score') # set x label
axs[1, 1].hist(df_nontreated['matched_ps'], color = 'black') # plot ps after matching among non-treated
axs[1, 1].set_title('Matched control') # set title
axs[1, 1].set(ylabel = 'Proportion') # set y label
axs[1, 1].set(xlabel = 'Propensity score') # set x label
plt.show() # show plot

# 17
''' no Python code available '''

# 18
import pandas as pd # load pandas library
import numpy as np # load numpy library
from zepid.causal.ipw import IPTW # load IPTW method from zepid library
from zepid.calc import counternull_pvalue # load counternull_pvalue method
df = pd.read_csv('data/lbw.csv') # load lbw data
df['race'] = np.where(df['race'] == 1, 1, 0) # define race as a binary indicator for white/black
ipw = IPTW(df, treatment = 'smoke', outcome = 'ptl') # define ipw model
ipw.treatment_model('race + age + lwt + ptl + ht + ui + ftv', # covariates
                    print_results = False) # don't print results
ipw.marginal_structural_model('smoke') # treatment
ipw.fit() # fit the model
ipw.summary() # show ATE
counternull_pvalue(0.117, -0.274, 0.186) # p-value based on the estimate and its CI bounds

# 19
''' no Python code available '''
# 20
from econml.dml import DML # load DML from econml
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression # load StatsModelsLinearRegression
import pandas as pd # load pandas library
import matplotlib.pyplot as plt # load matplotlib library (needed for the plot below)
df = pd.read_csv('data/games.csv') # load games data
games_nomis = df.dropna() # drop missings
games_nomis.loc[:, 'is_action'] = (games_nomis.loc[:, 'genre'] == 'Action').astype(int) # create dummy for genre 'Action'
Y = games_nomis.loc[:, 'sales'].values.ravel() # define outcome
D = games_nomis.loc[:, 'metascore'].values.ravel() # define treatment
X = games_nomis.loc[:, ['is_action', 'year', 'userscore']] # define covariates
est = DML(model_y = 'auto', # create DML model, WeightedLassoCV for learner E[Y|X]
          model_t = 'auto', # WeightedMultiTaskLassoCV for learner E[D|X]
          model_final = StatsModelsLinearRegression(fit_intercept = False), # learner for fitting the outcome residuals to the treatment residuals
          discrete_treatment = False) # continuous treatment
est = est.fit(Y = Y, T = D, X = X) # fit DML model on games data
ATE = est.effect(X = X, T0 = 0, T1 = 100) # estimate treatment effects
lb, ub = est.effect_interval(X = X, T0 = 0, T1 = 100) # estimate intervals for treatment effects
df = list(zip(D, ATE, lb, ub)) # join the above results
df = pd.DataFrame(df).astype(float) # create dataframe with them
ate = df.groupby([0]).agg({1: 'mean', 2: 'mean', 3: 'mean'}) # mean treatment effects and intervals
plt.figure() # create the plot
ate[1].plot(label = 'ate', color = 'black', linewidth = 2) # add treatment effect line
ate[2].plot(label = 'lb', color = 'gray', linestyle = '--') # add lower bound
ate[3].plot(label = 'ub', color = 'gray', linestyle = '--') # add upper bound
plt.xlabel('Treatment level A = a') # set x label
plt.ylabel('E(Y^a)') # set y label
plt.show() # show the plot
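# Addendum to # 20 (not in the original): econml can also average the constant
# marginal effects over the sample; a minimal sketch reusing est and X.
print(est.const_marginal_ate(X)) # average marginal effect of the continuous treatment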
# 21
import pandas as pd # load pandas library
import numpy as np # load numpy library
import doubleml as ml # load doubleml library
import multiprocessing # load multiprocessing library
from matplotlib import pyplot as plt # load pyplot library
from sklearn.linear_model import LogisticRegression # load logistic regression method
df = pd.read_csv('data/games.csv') # load games data
games_nomis = df.dropna() # drop missings
games_nomis['metascore'] = games_nomis['metascore'] > 75 # define binary treatment
games_nomis.loc[:, 'is_action'] = (games_nomis['genre'] == 'Action').astype(int) # transform action into a binary variable
dml_data = ml.DoubleMLData(games_nomis, # create dml data
                           y_col = 'sales', # define outcome
                           d_cols = 'metascore', # define treatment
                           x_cols = ['year', 'userscore', 'is_action']) # define covariates
ml_g = LogisticRegression() # learner for nuisance elements
ml_m = LogisticRegression() # learner for propensity nuisance functions
tau_vect = np.arange(0.1, 1, 0.05) # ranks where we want to estimate QTEs
ncores = multiprocessing.cpu_count() # how many cpus the computer has
cores_used = np.min([5, ncores - 1]) # how many cores we will use
qte = ml.DoubleMLQTE(dml_data, # QTE, specify the data
                     ml_g, # learner for nuisance elements
                     ml_m, # learner for propensity nuisance functions
                     quantiles = tau_vect).fit() # ranks, fit the QTE to our data
print(qte) # show results
ci_qte = qte.confint(level = 0.95) # confidence intervals
data = {'Quantile': tau_vect, # create df for plotting, add ranks
        'QTE': qte.coef, # add the estimated QTEs
        'ci_lower': ci_qte['2.5 %'], # add lower ci
        'ci_upper': ci_qte['97.5 %']} # add upper ci
df = pd.DataFrame(data) # generate pd dataframe
horiz = np.zeros(len(tau_vect)) # to draw a horizontal line
fig, ax = plt.subplots() # create plot
ax.plot(df['Quantile'], df['QTE'], color = 'black', marker = 'o') # add QTE line
ax.plot(df['Quantile'], df['ci_lower'], color = 'black', linestyle = '--') # add lower CI
ax.plot(df['Quantile'], df['ci_upper'], color = 'black', linestyle = '--') # add upper CI
ax.plot(df['Quantile'], horiz, color = 'black') # add horizontal line
ax.set_xlabel('Tau') # set x-axis label
ax.set_ylabel('QTE') # set y-axis label
plt.show() # show the plot

# 22
''' no Python code available '''

# 23
''' no Python code available '''

# 24
import pandas as pd # load pandas library
import statsmodels.api as sm # load statsmodels library
from statsmodels.stats.mediation import Mediation # load mediation model
df = pd.read_csv('data/wexpect.csv') # load wexpect data
outcome_model = sm.OLS.from_formula('wexpect2 ~ male + age + swiss + motherhighedu + fatherhighedu + business + econ + communi + businform', # outcome formula
                                    data = df) # specify the data
mediator_model = sm.OLS.from_formula('business ~ econ + communi + businform + male', # mediator formula
                                     data = df) # specify the data
model = Mediation(outcome_model, # mediation analysis, add outcome model
                  mediator_model, # add mediator model
                  exposure = 'male').fit() # add exposure, fit to our data
model.summary() # output
# Chapter 5

# 25
import pandas as pd # load pandas library
import doubleml as dml # load doubleml library
from sklearn import linear_model # load linear model from sklearn
df = pd.read_csv('data/JC.csv') # load JC data
X = df.iloc[:, 1:29] # define covariates
dml_data = dml.DoubleMLData(df, # create dml data from JC data
                            y_col = 'health48', # define outcome
                            d_cols = 'trainy1', # define treatment
                            x_cols = list(X.columns.values)) # define covariates
ml_m = linear_model.LogisticRegressionCV(penalty = 'l1', # define learner for E[D|X], norm of the penalty
                                         solver = 'saga', # solver supporting the l1 penalty
                                         max_iter = 300, # max. number of iterations
                                         Cs = 1, # number of values on the regularization path
                                         tol = 1e-3) # tolerance for stopping criteria
ml_l = linear_model.LassoCV() # define learner for E[Y|X]
dml_lasso = dml.DoubleMLPLR(dml_data, # double machine learning, add our data
                            ml_l, # add the learner for E[Y|X]
                            ml_m, # add the learner for E[D|X]
                            n_folds = 3).fit() # specify number of folds for cross-fitting, fit to our data
print(dml_lasso.summary) # output

# 26
from sklearn import ensemble # load ensemble from sklearn
ml_l = ensemble.RandomForestRegressor() # define learner for E[Y|X]
ml_m = ensemble.RandomForestRegressor() # define learner for E[D|X]
dml_rfg = dml.DoubleMLPLR(dml_data, # double machine learning
                          ml_l, # add the learner for E[Y|X]
                          ml_m, # add the learner for E[D|X]
                          n_folds = 3).fit() # specify number of folds for cross-fitting, fit to our data
print(dml_rfg.summary) # output
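# Addendum to # 26 (not in the original): the forest learners can also be
# tuned; a minimal sketch with illustrative (not book-prescribed)
# hyperparameter values, reusing dml_data from # 25.
ml_l = ensemble.RandomForestRegressor(n_estimators = 500, min_samples_leaf = 5) # tuned learner for E[Y|X]
ml_m = ensemble.RandomForestRegressor(n_estimators = 500, min_samples_leaf = 5) # tuned learner for E[D|X]
dml_rfg2 = dml.DoubleMLPLR(dml_data, ml_l, ml_m, n_folds = 3).fit() # refit with the tuned learners
print(dml_rfg2.summary) # output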
# 27
import pandas as pd # load pandas library
import math # import math library
from sklearn.ensemble import RandomForestClassifier # import rf classifier
from sklearn.ensemble import GradientBoostingRegressor # import gb regressor
from econml.dml import CausalForestDML # import dml causal forest
from scipy.stats import norm # import normal distribution
df = pd.read_csv('data/JC.csv') # load JC data
df.columns = df.columns.astype(str) # cast column names to string
Y = df.iloc[:, 39] # define outcome (pworky3)
D = df.iloc[:, 36] # define treatment (trainy1)
X = df.iloc[:, 1:29] # define covariates
educ = pd.get_dummies(X['educ']) # one-hot encoding...
hhsize = pd.get_dummies(X['hhsize']) # for the scikit-learn implementation...
educmum = pd.get_dummies(X['educmum']) # of all categorical variables
educdad = pd.get_dummies(X['educdad']) # one-hot encoding
welfarechild = pd.get_dummies(X['welfarechild']) # one-hot encoding
health = pd.get_dummies(X['health']) # one-hot encoding
smoke = pd.get_dummies(X['smoke']) # one-hot encoding
alcohol = pd.get_dummies(X['alcohol']) # one-hot encoding
X.drop(columns = ['educ', 'hhsize', 'educmum', 'educdad',
                  'welfarechild', 'health', 'smoke', 'alcohol'], # delete categorical variables
       axis = 1, # column axis
       inplace = True) # don't return a copy, modify the original df
X = pd.concat([X, educ, hhsize, educmum, educdad,
               welfarechild, health, smoke, alcohol], axis = 1) # add their one-hot encodings
X.columns = X.columns.astype(str) # cast column names to string
cf = CausalForestDML(n_estimators = 2000, # create causal forest object, num.trees in R
                     min_samples_leaf = 5, # min.node.size in R
                     max_depth = min(round(math.sqrt(len(X.columns))) + 20, len(X.columns)), # mtry in R
                     model_t = RandomForestClassifier(max_depth = 5, # define learner for E[D|X]
                                                      min_samples_leaf = 10, # tune parameters of learner
                                                      random_state = 123), # set seed
                     model_y = GradientBoostingRegressor(min_samples_leaf = 30, # define learner for E[Y|X]
                                                         n_estimators = 50, # tune parameters of learner
                                                         random_state = 123), # set seed
                     discrete_treatment = True, # D is binary
                     drate = True) # compute ATE at fit time
cf.fit(Y = Y, T = D, X = X) # fit causal forest on JC data
ATE = cf.ate_ # store ATE
SE = cf.ate_stderr_ # store std. error of ATE
pval = 2*norm.cdf(-abs(ATE/SE)) # compute p-value
print(ATE, SE, pval) # show the results

# 28
import matplotlib.pyplot as plt # load matplotlib library
CATE = cf.effect(X) # compute CATEs on the JC data
plt.hist(CATE, color = 'gray', bins = 16, rwidth = 0.9) # create histogram of CATEs
plt.xlabel('CATE') # set x-axis label
plt.ylabel('Frequency') # set y-axis label
plt.title('Histogram of CATE') # set title
plt.show() # show the plot

# 29
import numpy as np # load numpy library
import statsmodels.api as sm # load statsmodels library
highCATE = (CATE > np.median(CATE)).astype(int) # dummy for above-median CATE
highCATE = sm.add_constant(highCATE) # add constant to get an intercept
ols = sm.OLS(df['age'], highCATE).fit(cov_type = 'HC0') # run OLS regression with White's heteroskedasticity-robust se
print(ols.summary()) # output

# 30
import pandas as pd # load pandas library
import numpy as np # load numpy library
import doubleml as dml # load doubleml library
from sklearn import ensemble # load ensemble from sklearn
df = pd.read_csv('data/JC.csv') # load JC data
X = df.iloc[:, 1:29] # define covariates
dml_data = dml.DoubleMLData(df, # create dml_data from JC data
                            y_col = 'pworky3', # define outcome
                            d_cols = 'trainy1', # define treatment
                            x_cols = list(X.columns.values)) # define covariates
ml_g = ensemble.RandomForestRegressor(max_features = 'sqrt') # outcome model E[Y|D,X]
ml_m = ensemble.RandomForestClassifier() # treatment model E[D|X]
est = dml.DoubleMLIRM(dml_data, # doubleml for the interactive regression model
                      ml_g, # add learner for outcome
                      ml_m, # add learner for treatment
                      normalize_ipw = True, # normalize ip weights
                      trimming_rule = 'truncate', # trimming approach
                      trimming_threshold = 0.01).fit() # threshold for trimming
df['intercept'] = np.ones(len(df)) # add constant column to get an intercept
cateX = df.loc[:, ['intercept', 'female']] # covariates for investigating effect heterogeneity
CATEs = est.cate(cateX) # estimate effect heterogeneity
print(CATEs) # CATEs by gender

# 31
cf.cate_feature_names() # show the feature names entering the CATE (causal forest from # 27)
# 32
import pandas as pd # load pandas library
import numpy as np # load numpy library
import doubleml as dml # load doubleml library
from sklearn import ensemble # load ensemble from sklearn
df = pd.read_csv('data/lalonde.csv') # load lalonde data
X = df.drop(['re78', 'treat'], axis = 1) # define covariates
dml_data = dml.DoubleMLData(df, # create dml_data from lalonde data
                            y_col = 're78', # define outcome
                            d_cols = 'treat', # define treatment
                            x_cols = list(X.columns.values)) # define covariates
ml_g = ensemble.RandomForestRegressor(max_features = 'sqrt') # outcome model E[Y|D,X]
ml_m = ensemble.RandomForestClassifier() # treatment model E[D|X]
est = dml.DoubleMLIRM(dml_data, # doubleml for the interactive regression model
                      ml_g, # add learner for outcome
                      ml_m, # add learner for treatment
                      normalize_ipw = True, # normalize ip weights
                      trimming_rule = 'truncate', # trimming approach
                      trimming_threshold = 0.01).fit() # threshold for trimming
policy_tree = est.policy_tree(features = X, depth = 2) # optimal policies for 4 subgroups, model as in # 30
policy_tree.plot_tree() # plot tree with the optimal policy

# Chapter 6

# 33
import pandas as pd # load pandas library
df = pd.read_csv('data/JC.csv') # load JC data
Z = df['assignment'] # define instrument
D = df['trainy1'] # define treatment
Y = df['earny4'] # define outcome
ITT = Y[Z==1].mean() - Y[Z==0].mean() # estimate intention-to-treat effect
first = D[Z==1].mean() - D[Z==0].mean() # estimate first stage effect (complier share)
LATE = ITT/first # compute LATE
print(ITT, first, LATE) # show ITT, first stage effect and LATE

# 34
import statsmodels.api as sm # load statsmodels library
import linearmodels.iv.model as lm # load linearmodels library
import pandas as pd # load pandas library
df = pd.read_csv('data/JC.csv') # load JC data
df = sm.add_constant(data = df, prepend = False) # add a constant to get an intercept
ivreg = lm.IV2SLS(dependent = df['earny4'], # dependent variable
                  endog = df['trainy1'], # endogenous variable
                  exog = df['const'], # exogenous variable
                  instruments = df['assignment']) # instrument
LATE = ivreg.fit(cov_type = 'robust') # run 2SLS with heteroskedasticity-robust se
LATE.summary # results

# 35
''' no Python code available '''

# 36
import numpy as np # load numpy library
import pandas as pd # load pandas library
import doubleml as dml # load doubleml library
from sklearn.ensemble import GradientBoostingRegressor # import gb regressor
from sklearn.base import clone # import clone
learner = GradientBoostingRegressor() # define learner as gb regressor
ml_l = clone(learner) # learner for E[Y|X]
ml_m = clone(learner) # learner for E[Z|X]
ml_r = clone(learner) # learner for E[D|X]
np.random.seed(1) # set seed
df = pd.read_csv('data/c401k.csv') # load c401k data
X = df.iloc[:, 4:11] # define covariates
obj_dml_data = dml.DoubleMLData(df, # create double machine learning data object
                                y_col = 'nettfa', # define outcome
                                d_cols = 'p401k', # define treatment
                                x_cols = list(X.columns.values), # define covariates
                                z_cols = 'e401k') # define instrument
dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data, ml_l, ml_m, ml_r).fit() # fit the partially linear IV model
print(dml_pliv_obj.summary) # print the results

# 37
''' no Python code available '''

# Chapter 7
# 38
import numpy as np # load numpy library
import pandas as pd # load pandas library
import statsmodels.api as sm # load statsmodels library
df = pd.read_csv('data/kielmc.csv') # load kielmc data
Y = df['rprice'] # define outcome
D = df['nearinc'] # define treatment group
T = df['y81'] # define period dummy
df['interact'] = D * T # treatment-period interaction
x = df[['nearinc', 'y81', 'interact']] # create regressor matrix with D, T and their interaction
x = sm.add_constant(x) # add a constant to get an intercept
cluster_var = df['cbd'] # cluster variable
ols = sm.OLS(Y, x).fit(cov_type = 'cluster', # difference-in-differences
                       cov_kwds = {'groups': cluster_var}) # with clustered standard errors
print(ols.summary()) # print the results

# 39
''' no Python code available '''

# 40
''' no Python code available '''

# 41
''' no Python code available '''

# Chapter 8

# 42
import pandas as pd # load pandas library
import numpy as np # load numpy library
from synthdid.synthdid import Synthdid as sdid # load synthdid library
from synthdid.get_data import california_prop99 # load california data
np.random.seed(1) # set seed
out = sdid(california_prop99(), # synthetic difference-in-differences
           unit = "State", # define unit
           time = "Year", # define time
           treatment = "treated", # define treatment
           outcome = "PacksPerCapita") # define outcome
out = out.fit().vcov(method = 'placebo') # placebo standard errors
out.summary().summary2 # show results

# 43
import pandas as pd # load pandas library
import numpy as np # load numpy library
from synthdid.synthdid import Synthdid as sdid # load synthdid library
from synthdid.get_data import california_prop99 # load california data
import matplotlib.pyplot as plt # load matplotlib library
np.random.seed(1) # set seed
out = sdid(california_prop99(), # synthetic difference-in-differences
           unit = "State", # define unit
           time = "Year", # define time
           treatment = "treated", # define treatment
           outcome = "PacksPerCapita") # define outcome
out.weights = np.zeros(19) # set unit weights to zero
out.fit(omega_intercept = False).vcov(method = 'placebo') # placebo standard errors
print(out.summary().summary2) # show results
out.plot_outcomes() # plot results

# 44
from rdrobust import rdrobust, rdplot # load rdrobust library
import pandas as pd # load pandas library
df = pd.read_csv('data/rdrobust_RDsenate.csv') # load data
R = df.margin # running variable = margin of winning
Y = df.vote # outcome = vote share of democrats
results = rdrobust(y = Y, x = R) # sharp RDD
print(results) # show results
rdplot(y = Y, x = R) # plot results

# 45
''' no Python code available '''

# 46
import pandas as pd # load pandas library
import numpy as np # load numpy library
from rdrobust import rdrobust # load rdrobust library
df = pd.read_csv('data/rcp.csv') # load rcp data
Y = df['cn'] # outcome
R = df['elig_year'] # running variable
D = df['retired'] # treatment
results = rdrobust(y = Y, x = R, fuzzy = D) # fuzzy RDD
print(results) # show results

# 47
import pandas as pd # load pandas library
from rdrobust import rdrobust # load rdrobust library
df = pd.read_csv('data/rkd.csv') # load rkd data
Y = df['pers_total'] # outcome
R = df['forcing'] # running variable
D = df['costequalgrants'] # treatment
results = rdrobust(y = Y, x = R, fuzzy = D, deriv = 1) # fuzzy regression kink design (RKD)
print(results) # show results

# 48
''' no Python code available '''

# 49
''' no Python code available '''

# 50
''' no Python code available '''

# 51
''' no Python code available '''

# 52
''' no Python code available '''
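# Addendum to # 44 (not in the original): rdrobust also accepts a manual
# bandwidth h, which is useful for probing the sensitivity of the sharp RDD
# estimate; a minimal sketch with an illustrative (not data-driven) bandwidth.
import pandas as pd # load pandas library
from rdrobust import rdrobust # load rdrobust library
df = pd.read_csv('data/rdrobust_RDsenate.csv') # reload the senate data from # 44
results_h10 = rdrobust(y = df.vote, x = df.margin, h = 10) # sharp RDD with fixed bandwidth h = 10
print(results_h10) # show results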